In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import os
import warnings
warnings.filterwarnings('ignore')
In [2]:
os.getcwd()
Out[2]:
'/Users/saikrishnaganta'
In [3]:
path = '/Users/saikrishnaganta/Acmegrade/Data Science/Projects/marketig department'
os.getcwd()
Out[3]:
'/Users/saikrishnaganta'
In [4]:
creditcard_df = pd.read_csv('Marketing_data.csv')
display (creditcard_df )
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | C10002 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | C10004 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.00 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | C19186 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.500000 | 6 |
| 8946 | C19187 | 19.183215 | 1.000000 | 300.00 | 0.00 | 300.00 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 275.861322 | NaN | 0.000000 | 6 |
| 8947 | C19188 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.000000 | 0.833333 | 0.000000 | 0.666667 | 0.000000 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.250000 | 6 |
| 8948 | C19189 | 13.457564 | 0.833333 | 0.00 | 0.00 | 0.00 | 36.558778 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 2 | 0 | 500.0 | 52.549959 | 55.755628 | 0.250000 | 6 |
| 8949 | C19190 | 372.708075 | 0.666667 | 1093.25 | 1093.25 | 0.00 | 127.040008 | 0.666667 | 0.666667 | 0.000000 | 0.333333 | 2 | 23 | 1200.0 | 63.165404 | 88.288956 | 0.000000 | 6 |
8950 rows × 18 columns
In [5]:
creditcard_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8950 entries, 0 to 8949 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CUST_ID 8950 non-null object 1 BALANCE 8950 non-null float64 2 BALANCE_FREQUENCY 8950 non-null float64 3 PURCHASES 8950 non-null float64 4 ONEOFF_PURCHASES 8950 non-null float64 5 INSTALLMENTS_PURCHASES 8950 non-null float64 6 CASH_ADVANCE 8950 non-null float64 7 PURCHASES_FREQUENCY 8950 non-null float64 8 ONEOFF_PURCHASES_FREQUENCY 8950 non-null float64 9 PURCHASES_INSTALLMENTS_FREQUENCY 8950 non-null float64 10 CASH_ADVANCE_FREQUENCY 8950 non-null float64 11 CASH_ADVANCE_TRX 8950 non-null int64 12 PURCHASES_TRX 8950 non-null int64 13 CREDIT_LIMIT 8949 non-null float64 14 PAYMENTS 8950 non-null float64 15 MINIMUM_PAYMENTS 8637 non-null float64 16 PRC_FULL_PAYMENT 8950 non-null float64 17 TENURE 8950 non-null int64 dtypes: float64(14), int64(3), object(1) memory usage: 1.2+ MB
In [6]:
creditcard_df.describe()
Out[6]:
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8949.000000 | 8950.000000 | 8637.000000 | 8950.000000 | 8950.000000 |
| mean | 1564.474828 | 0.877271 | 1003.204834 | 592.437371 | 411.067645 | 978.871112 | 0.490351 | 0.202458 | 0.364437 | 0.135144 | 3.248827 | 14.709832 | 4494.449450 | 1733.143852 | 864.206542 | 0.153715 | 11.517318 |
| std | 2081.531879 | 0.236904 | 2136.634782 | 1659.887917 | 904.338115 | 2097.163877 | 0.401371 | 0.298336 | 0.397448 | 0.200121 | 6.824647 | 24.857649 | 3638.815725 | 2895.063757 | 2372.446607 | 0.292499 | 1.338331 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 128.281915 | 0.888889 | 39.635000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.276166 | 169.123707 | 0.000000 | 12.000000 |
| 50% | 873.385231 | 1.000000 | 361.280000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 856.901546 | 312.343947 | 0.000000 | 12.000000 |
| 75% | 2054.140036 | 1.000000 | 1110.130000 | 577.405000 | 468.637500 | 1113.821139 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.134317 | 825.485459 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
In [7]:
# Check who made one off purchase of $40761 ie maximum ONEOFF_PURCHASES
creditcard_df[creditcard_df['ONEOFF_PURCHASES'] == 40761.25]
Out[7]:
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 550 | C10574 | 11547.52001 | 1.0 | 49039.57 | 40761.25 | 8278.32 | 558.166886 | 1.0 | 1.0 | 0.916667 | 0.083333 | 1 | 101 | 22500.0 | 46930.59824 | 2974.069421 | 0.25 | 12 |
In [8]:
creditcard_df['CASH_ADVANCE'].max()
Out[8]:
47137.21176
In [9]:
# Check who made cash advance of $47137
# This customer made 123 cash advance transactions
# Never paid credit card in full
creditcard_df[creditcard_df['CASH_ADVANCE'] == 47137.211760000006]
Out[9]:
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE |
|---|
In [10]:
# Check for missing Data
creditcard_df.isnull().sum()
Out[10]:
CUST_ID 0 BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 1 PAYMENTS 0 MINIMUM_PAYMENTS 313 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
In [11]:
# Heat map for missing data
sns.heatmap(creditcard_df.isnull(), yticklabels = False, cbar = False, cmap="Blues")
Out[11]:
<Axes: >
In [12]:
# Fill up the missing elements with mean of the 'MINIMUM_PAYMENT'
creditcard_df.loc[(creditcard_df['MINIMUM_PAYMENTS'].isnull() == True), 'MINIMUM_PAYMENTS'] = creditcard_df['MINIMUM_PAYMENTS'].mean()
In [13]:
# Fill up the missing elements with mean of the 'CREDIT_LIMIT'
creditcard_df.loc[(creditcard_df['CREDIT_LIMIT'].isnull() == True), 'CREDIT_LIMIT'] = creditcard_df['CREDIT_LIMIT'].mean()
In [14]:
creditcard_df.duplicated().sum()
Out[14]:
0
In [15]:
creditcard_df.isnull().sum()
Out[15]:
CUST_ID 0 BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 0 PAYMENTS 0 MINIMUM_PAYMENTS 0 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
In [16]:
sns.heatmap(creditcard_df.isnull(), yticklabels = False, cbar = False, cmap="Blues")
Out[16]:
<Axes: >
In [17]:
# Check for duplicated entries in the data
creditcard_df.duplicated().sum()
Out[17]:
0
In [18]:
# Drop Customer ID since it has no meaning here
creditcard_df.drop("CUST_ID", axis = 1, inplace= True)
display(creditcard_df)
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.00 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.500000 | 6 |
| 8946 | 19.183215 | 1.000000 | 300.00 | 0.00 | 300.00 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 275.861322 | 864.206542 | 0.000000 | 6 |
| 8947 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.000000 | 0.833333 | 0.000000 | 0.666667 | 0.000000 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.250000 | 6 |
| 8948 | 13.457564 | 0.833333 | 0.00 | 0.00 | 0.00 | 36.558778 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 2 | 0 | 500.0 | 52.549959 | 55.755628 | 0.250000 | 6 |
| 8949 | 372.708075 | 0.666667 | 1093.25 | 1093.25 | 0.00 | 127.040008 | 0.666667 | 0.666667 | 0.000000 | 0.333333 | 2 | 23 | 1200.0 | 63.165404 | 88.288956 | 0.000000 | 6 |
8950 rows × 17 columns
In [19]:
n = len(creditcard_df.columns)
display(n)
17
In [20]:
display (creditcard_df.columns)
Index(['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES',
'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY',
'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
'TENURE'],
dtype='object')
In [21]:
# distplot combines the matplotlib.hist function with seaborn kdeplot()
# KDE Plot represents the Kernel Density Estimate
# KDE is used for visualizing the Probability Density of a continuous variable.
# KDE demonstrates the probability density at different values in a continuous variable.
# Mean of balance is $1500
# 'Balance_Frequency' for most customers is updated frequently ~1
# For 'PURCHASES_FREQUENCY', there are two distinct group of customers
# For 'ONEOFF_PURCHASES_FREQUENCY' and 'PURCHASES_INSTALLMENT_FREQUENCY' most users don't do one off puchases or installment purchases frequently
# Very small number of customers pay their balance in full 'PRC_FULL_PAYMENT'~0
# Credit limit average is around $4500
# Most customers are ~11 years tenure
plt.figure(figsize=(20,80))
for i in range(len(creditcard_df.columns)):
plt.subplot(17, 1, i+1)
sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws={"color": "b", "lw": 3, "label": "KDE"}, hist_kws={"color": "g"})
plt.title(creditcard_df.columns[i])
plt.tight_layout()
In [22]:
sns.pairplot(creditcard_df)
# Correlation between 'PURCHASES' and ONEOFF_PURCHASES & INSTALMENT_PURCHASES
# Trend between 'PURCHASES' and 'CREDIT_LIMIT' & 'PAYMENTS'
Out[22]:
<seaborn.axisgrid.PairGrid at 0x166322810>
In [23]:
correlations = creditcard_df.corr()
display (correlations )
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| BALANCE | 1.000000 | 0.322412 | 0.181261 | 0.164350 | 0.126469 | 0.496692 | -0.077944 | 0.073166 | -0.063186 | 0.449218 | 0.385152 | 0.154338 | 0.531267 | 0.322802 | 0.394282 | -0.318959 | 0.072692 |
| BALANCE_FREQUENCY | 0.322412 | 1.000000 | 0.133674 | 0.104323 | 0.124292 | 0.099388 | 0.229715 | 0.202415 | 0.176079 | 0.191873 | 0.141555 | 0.189626 | 0.095795 | 0.065008 | 0.114249 | -0.095082 | 0.119776 |
| PURCHASES | 0.181261 | 0.133674 | 1.000000 | 0.916845 | 0.679896 | -0.051474 | 0.393017 | 0.498430 | 0.315567 | -0.120143 | -0.067175 | 0.689561 | 0.356959 | 0.603264 | 0.093515 | 0.180379 | 0.086288 |
| ONEOFF_PURCHASES | 0.164350 | 0.104323 | 0.916845 | 1.000000 | 0.330622 | -0.031326 | 0.264937 | 0.524891 | 0.127729 | -0.082628 | -0.046212 | 0.545523 | 0.319721 | 0.567292 | 0.048597 | 0.132763 | 0.064150 |
| INSTALLMENTS_PURCHASES | 0.126469 | 0.124292 | 0.679896 | 0.330622 | 1.000000 | -0.064244 | 0.442418 | 0.214042 | 0.511351 | -0.132318 | -0.073999 | 0.628108 | 0.256496 | 0.384084 | 0.131687 | 0.182569 | 0.086143 |
| CASH_ADVANCE | 0.496692 | 0.099388 | -0.051474 | -0.031326 | -0.064244 | 1.000000 | -0.215507 | -0.086754 | -0.177070 | 0.628522 | 0.656498 | -0.075850 | 0.303983 | 0.453238 | 0.139223 | -0.152935 | -0.068312 |
| PURCHASES_FREQUENCY | -0.077944 | 0.229715 | 0.393017 | 0.264937 | 0.442418 | -0.215507 | 1.000000 | 0.501343 | 0.862934 | -0.308478 | -0.203478 | 0.568430 | 0.119778 | 0.103464 | 0.002976 | 0.305802 | 0.061506 |
| ONEOFF_PURCHASES_FREQUENCY | 0.073166 | 0.202415 | 0.498430 | 0.524891 | 0.214042 | -0.086754 | 0.501343 | 1.000000 | 0.142329 | -0.111716 | -0.069088 | 0.544869 | 0.295030 | 0.243537 | -0.029963 | 0.157531 | 0.082466 |
| PURCHASES_INSTALLMENTS_FREQUENCY | -0.063186 | 0.176079 | 0.315567 | 0.127729 | 0.511351 | -0.177070 | 0.862934 | 0.142329 | 1.000000 | -0.262958 | -0.169207 | 0.529975 | 0.060752 | 0.085551 | 0.029590 | 0.250087 | 0.073275 |
| CASH_ADVANCE_FREQUENCY | 0.449218 | 0.191873 | -0.120143 | -0.082628 | -0.132318 | 0.628522 | -0.308478 | -0.111716 | -0.262958 | 1.000000 | 0.799561 | -0.131168 | 0.132616 | 0.183192 | 0.097898 | -0.249773 | -0.133372 |
| CASH_ADVANCE_TRX | 0.385152 | 0.141555 | -0.067175 | -0.046212 | -0.073999 | 0.656498 | -0.203478 | -0.069088 | -0.169207 | 0.799561 | 1.000000 | -0.066157 | 0.149699 | 0.255278 | 0.109185 | -0.169784 | -0.043421 |
| PURCHASES_TRX | 0.154338 | 0.189626 | 0.689561 | 0.545523 | 0.628108 | -0.075850 | 0.568430 | 0.544869 | 0.529975 | -0.131168 | -0.066157 | 1.000000 | 0.272877 | 0.370832 | 0.095858 | 0.162066 | 0.121874 |
| CREDIT_LIMIT | 0.531267 | 0.095795 | 0.356959 | 0.319721 | 0.256496 | 0.303983 | 0.119778 | 0.295030 | 0.060752 | 0.132616 | 0.149699 | 0.272877 | 1.000000 | 0.421852 | 0.125134 | 0.055671 | 0.139034 |
| PAYMENTS | 0.322802 | 0.065008 | 0.603264 | 0.567292 | 0.384084 | 0.453238 | 0.103464 | 0.243537 | 0.085551 | 0.183192 | 0.255278 | 0.370832 | 0.421852 | 1.000000 | 0.125046 | 0.112138 | 0.106136 |
| MINIMUM_PAYMENTS | 0.394282 | 0.114249 | 0.093515 | 0.048597 | 0.131687 | 0.139223 | 0.002976 | -0.029963 | 0.029590 | 0.097898 | 0.109185 | 0.095858 | 0.125134 | 0.125046 | 1.000000 | -0.139674 | 0.057257 |
| PRC_FULL_PAYMENT | -0.318959 | -0.095082 | 0.180379 | 0.132763 | 0.182569 | -0.152935 | 0.305802 | 0.157531 | 0.250087 | -0.249773 | -0.169784 | 0.162066 | 0.055671 | 0.112138 | -0.139674 | 1.000000 | -0.016486 |
| TENURE | 0.072692 | 0.119776 | 0.086288 | 0.064150 | 0.086143 | -0.068312 | 0.061506 | 0.082466 | 0.073275 | -0.133372 | -0.043421 | 0.121874 | 0.139034 | 0.106136 | 0.057257 | -0.016486 | 1.000000 |
In [24]:
f, ax = plt.subplots(figsize = (20, 20))
sns.heatmap(correlations, annot = True)
# 'PURCHASES' have high correlation between one-off purchases, 'installment purchases, purchase transactions, credit limit and payments.
# Strong Positive Correlation between 'PURCHASES_FREQUENCY' and 'PURCHASES_INSTALLMENT_FREQUENCY'
Out[24]:
<Axes: >
In [25]:
display (creditcard_df)
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.00 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.500000 | 6 |
| 8946 | 19.183215 | 1.000000 | 300.00 | 0.00 | 300.00 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 275.861322 | 864.206542 | 0.000000 | 6 |
| 8947 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.000000 | 0.833333 | 0.000000 | 0.666667 | 0.000000 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.250000 | 6 |
| 8948 | 13.457564 | 0.833333 | 0.00 | 0.00 | 0.00 | 36.558778 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 2 | 0 | 500.0 | 52.549959 | 55.755628 | 0.250000 | 6 |
| 8949 | 372.708075 | 0.666667 | 1093.25 | 1093.25 | 0.00 | 127.040008 | 0.666667 | 0.666667 | 0.000000 | 0.333333 | 2 | 23 | 1200.0 | 63.165404 | 88.288956 | 0.000000 | 6 |
8950 rows × 17 columns
In [26]:
# Let's scale the data first
scaler = StandardScaler()
creditcard_df_scaled = scaler.fit_transform(creditcard_df)
In [27]:
creditcard_df_scaled.shape
Out[27]:
(8950, 17)
In [28]:
display(pd.DataFrame(creditcard_df_scaled))
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.731989 | -0.249434 | -0.424900 | -0.356934 | -0.349079 | -0.466786 | -0.806490 | -0.678661 | -0.707313 | -0.675349 | -0.476070 | -0.511333 | -0.960433 | -0.528979 | -3.109675e-01 | -0.525551 | 0.360680 |
| 1 | 0.786961 | 0.134325 | -0.469552 | -0.356934 | -0.454576 | 2.605605 | -1.221758 | -0.678661 | -0.916995 | 0.573963 | 0.110074 | -0.591796 | 0.688639 | 0.818642 | 8.931021e-02 | 0.234227 | 0.360680 |
| 2 | 0.447135 | 0.518084 | -0.107668 | 0.108889 | -0.454576 | -0.466786 | 1.269843 | 2.673451 | -0.916995 | -0.675349 | -0.476070 | -0.109020 | 0.826062 | -0.383805 | -1.016632e-01 | -0.525551 | 0.360680 |
| 3 | 0.049099 | -1.016953 | 0.232058 | 0.546189 | -0.454576 | -0.368653 | -1.014125 | -0.399319 | -0.916995 | -0.258913 | -0.329534 | -0.551565 | 0.826062 | -0.598688 | 4.878305e-17 | -0.525551 | 0.360680 |
| 4 | -0.358775 | 0.518084 | -0.462063 | -0.347294 | -0.454576 | -0.466786 | -1.014125 | -0.399319 | -0.916995 | -0.675349 | -0.476070 | -0.551565 | -0.905464 | -0.364368 | -2.657913e-01 | -0.525551 | 0.360680 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | -0.737950 | 0.518084 | -0.333293 | -0.356934 | -0.132643 | -0.466786 | 1.269843 | -0.678661 | 1.179833 | -0.675349 | -0.476070 | -0.350408 | -0.960433 | -0.486217 | -3.498541e-01 | 1.183951 | -4.122768 |
| 8946 | -0.742423 | 0.518084 | -0.329136 | -0.356934 | -0.122823 | -0.466786 | 1.269843 | -0.678661 | 1.179833 | -0.675349 | -0.476070 | -0.350408 | -0.960433 | -0.503396 | 4.878305e-17 | -0.525551 | -4.122768 |
| 8947 | -0.740398 | -0.185477 | -0.401965 | -0.356934 | -0.294893 | -0.466786 | 0.854576 | -0.678661 | 0.760469 | -0.675349 | -0.476070 | -0.390639 | -0.960433 | -0.570615 | -3.354655e-01 | 0.329200 | -4.122768 |
| 8948 | -0.745174 | -0.185477 | -0.469552 | -0.356934 | -0.454576 | -0.449352 | -1.221758 | -0.678661 | -0.916995 | 0.157527 | -0.182998 | -0.591796 | -1.097856 | -0.580536 | -3.469065e-01 | 0.329200 | -4.122768 |
| 8949 | -0.572575 | -0.889033 | 0.042146 | 0.301732 | -0.454576 | -0.406205 | 0.439310 | 1.556082 | -0.916995 | 0.990398 | -0.182998 | 0.333524 | -0.905464 | -0.576869 | -3.329464e-01 | -0.525551 | -4.122768 |
8950 rows × 17 columns
In [29]:
scores_1 = []
range_values = range(1, 20)
for i in range_values:
kmeans = KMeans(n_clusters = i)
kmeans.fit(creditcard_df_scaled)
scores_1.append(kmeans.inertia_)
plt.plot(scores_1, 'bx-')
plt.title('Finding the right number of clusters')
plt.xlabel('Clusters')
plt.ylabel('Scores')
plt.show()
# From this we can observe that, 4th cluster seems to be forming the elbow of the curve.
# However, the values does not reduce linearly until 8th cluster.
# Let's choose the number of clusters to be 7.
In [30]:
kmeans = KMeans(8)
kmeans.fit(creditcard_df_scaled)
labels = kmeans.labels_
In [31]:
kmeans.cluster_centers_.shape
Out[31]:
(8, 17)
In [32]:
cluster_centers = pd.DataFrame(data = kmeans.cluster_centers_, columns = [creditcard_df.columns])
display(cluster_centers )
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.710404 | -0.348568 | -0.069657 | -0.203109 | 0.208006 | -0.444008 | 0.579687 | -0.194433 | 0.582219 | -0.641666 | -0.455041 | -0.032406 | -0.049286 | -0.206569 | -0.297009 | 2.251728 | 0.130890 |
| 1 | -0.226566 | 0.254336 | -0.043341 | -0.203397 | 0.271103 | -0.335812 | 0.957736 | -0.258514 | 1.131312 | -0.407835 | -0.321458 | 0.193606 | -0.296811 | -0.208180 | 0.086496 | -0.306662 | 0.278313 |
| 2 | 2.058951 | 0.392258 | -0.074157 | -0.064124 | -0.057840 | 2.958244 | -0.214903 | -0.054070 | -0.162081 | 2.313287 | 2.736012 | -0.059729 | 1.351496 | 1.513565 | 0.741851 | -0.348590 | -0.019274 |
| 3 | 1.526356 | 0.405003 | 7.356829 | 6.461232 | 5.521867 | 0.021953 | 1.075498 | 1.841044 | 1.035877 | -0.314437 | -0.112661 | 4.864643 | 2.242065 | 5.069151 | 1.204065 | 0.810717 | 0.330790 |
| 4 | 0.752829 | 0.404591 | -0.365564 | -0.254346 | -0.397105 | 0.625632 | -0.883494 | -0.418778 | -0.762496 | 1.102542 | 0.738791 | -0.477212 | 0.238244 | 0.029338 | 0.238956 | -0.442429 | 0.199499 |
| 5 | -0.365914 | -0.394812 | -0.323888 | -0.205536 | -0.387993 | -0.299175 | -0.747326 | -0.311164 | -0.716239 | -0.336099 | -0.302707 | -0.453405 | -0.358020 | -0.280702 | -0.168230 | -0.381427 | 0.273280 |
| 6 | -0.343027 | -0.469088 | -0.291771 | -0.207558 | -0.308181 | 0.056474 | -0.254356 | -0.273245 | -0.283325 | 0.288114 | -0.014683 | -0.396879 | -0.559378 | -0.397704 | -0.202052 | -0.097432 | -3.200486 |
| 7 | 0.230723 | 0.451661 | 1.089843 | 1.027724 | 0.688834 | -0.281540 | 1.131817 | 1.934360 | 0.618381 | -0.373240 | -0.294548 | 1.369933 | 0.777314 | 0.478822 | -0.023544 | 0.329919 | 0.307844 |
In [33]:
# In order to understand what these numbers mean, perform inverse transformation
cluster_centers = scaler.inverse_transform(cluster_centers)
cluster_centers = pd.DataFrame(data = cluster_centers, columns = [creditcard_df.columns])
display(cluster_centers)
# First Customers cluster (Transactors): Those are customers who pay least amount of intrerest charges and careful with their money, Cluster with lowest balance ($104) and cash advance ($303), Percentage of full payment = 23%
# Second customers cluster (revolvers) who use credit card as a loan (most lucrative sector): highest balance ($5000) and cash advance (~$5000), low purchase frequency, high cash advance frequency (0.5), high cash advance transactions (16) and low percentage of full payment (3%)
# Third customer cluster (VIP/Prime): high credit limit $16K and highest percentage of full payment, target for increase credit limit and increase spending habits
# Fourth customer cluster (low tenure): these are customers with low tenure (7 years), low balanc
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 85.827900 | 0.794698 | 854.381834 | 255.317927 | 599.165251 | 47.765379 | 0.723007 | 0.144455 | 0.595826 | 0.006740 | 0.143508 | 13.904328 | 4315.128564 | 1135.147894 | 172.039624 | 0.812306 | 11.692483 |
| 1 | 1092.896102 | 0.937521 | 910.605937 | 254.840409 | 656.222517 | 274.658079 | 0.874736 | 0.125338 | 0.814050 | 0.053532 | 1.055114 | 19.522159 | 3414.528811 | 1130.484192 | 1065.782352 | 0.064021 | 11.889773 |
| 2 | 5850.006716 | 0.970193 | 844.768305 | 486.004600 | 358.763705 | 7182.446638 | 0.404100 | 0.186328 | 0.300022 | 0.598057 | 21.920097 | 13.225182 | 9411.743341 | 6114.767624 | 2593.058885 | 0.051758 | 11.491525 |
| 3 | 4741.455043 | 0.973212 | 16721.182533 | 11316.758933 | 5404.423600 | 1024.907342 | 0.922000 | 0.751677 | 0.776121 | 0.072222 | 2.480000 | 135.626667 | 12652.000000 | 16407.838942 | 3670.229657 | 0.390835 | 11.960000 |
| 4 | 3131.424557 | 0.973115 | 222.171152 | 170.275114 | 51.970631 | 2290.849689 | 0.135762 | 0.077528 | 0.061402 | 0.355774 | 8.290536 | 2.848129 | 5361.277485 | 1818.075110 | 1421.083891 | 0.024312 | 11.784299 |
| 5 | 802.856116 | 0.783743 | 311.212183 | 251.289550 | 60.210203 | 351.486507 | 0.190413 | 0.109631 | 0.079786 | 0.067887 | 1.183073 | 3.439884 | 3191.827098 | 920.540430 | 472.154771 | 0.042154 | 11.883037 |
| 6 | 850.493066 | 0.766148 | 379.832326 | 247.933166 | 132.383813 | 1097.299957 | 0.388265 | 0.120943 | 0.251837 | 0.192799 | 3.148627 | 4.844911 | 2459.204819 | 581.829880 | 393.331757 | 0.125217 | 7.234249 |
| 7 | 2044.705236 | 0.984265 | 3331.670275 | 2298.249449 | 1033.971791 | 388.468841 | 0.944603 | 0.779515 | 0.610198 | 0.060455 | 1.238751 | 48.761249 | 7322.635445 | 3119.285237 | 809.337461 | 0.250210 | 11.929293 |
In [34]:
display(labels.shape) # Labels associated to each data point
display (labels.max())
display (labels.min())
(8950,)
7
0
In [35]:
y_kmeans = kmeans.fit_predict(creditcard_df_scaled)
display(y_kmeans)
array([0, 3, 7, ..., 2, 2, 2], dtype=int32)
In [36]:
# concatenate the clusters labels to our original dataframe
creditcard_df_cluster = pd.concat([creditcard_df, pd.DataFrame({'cluster':labels})], axis = 1)
creditcard_df_cluster.head()
Out[36]:
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 | 5 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 | 4 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 | 7 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 | 5 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 | 5 |
In [37]:
for i in creditcard_df.columns:
plt.figure(figsize = (20,5))
for j in range(8):
plt.subplot(1,8,j+1)
cluster = creditcard_df_cluster[creditcard_df_cluster['cluster'] == j]
cluster[i].hist(bins = 20)
plt.title('{} \nCluster {} '.format(i,j))
plt.show()
In [38]:
pca = PCA(n_components=2)
principal_comp = pca.fit_transform(creditcard_df_scaled)
display(principal_comp)
array([[-1.68222026, -1.07645061],
[-1.1382949 , 2.50647666],
[ 0.96968401, -0.3835203 ],
...,
[-0.92620364, -1.8107856 ],
[-2.33655167, -0.65796601],
[-0.55642216, -0.40046712]])
In [39]:
# Create a dataframe with the two components
pca_df = pd.DataFrame(data = principal_comp, columns =['pca1','pca2'])
pca_df.head()
Out[39]:
| pca1 | pca2 | |
|---|---|---|
| 0 | -1.682220 | -1.076451 |
| 1 | -1.138295 | 2.506477 |
| 2 | 0.969684 | -0.383520 |
| 3 | -0.873628 | 0.043166 |
| 4 | -1.599434 | -0.688581 |
In [40]:
# Concatenate the clusters labels to the dataframe
pca_df = pd.concat([pca_df,pd.DataFrame({'cluster':labels})], axis = 1)
display(pca_df)
| pca1 | pca2 | cluster | |
|---|---|---|---|
| 0 | -1.682220 | -1.076451 | 5 |
| 1 | -1.138295 | 2.506477 | 4 |
| 2 | 0.969684 | -0.383520 | 7 |
| 3 | -0.873628 | 0.043166 | 5 |
| 4 | -1.599434 | -0.688581 | 5 |
| ... | ... | ... | ... |
| 8945 | -0.359629 | -2.016145 | 6 |
| 8946 | -0.564369 | -1.639123 | 6 |
| 8947 | -0.926204 | -1.810786 | 6 |
| 8948 | -2.336552 | -0.657966 | 6 |
| 8949 | -0.556422 | -0.400467 | 6 |
8950 rows × 3 columns
In [41]:
pca_df.value_counts(pca_df.cluster)
Out[41]:
cluster 5 2753 1 1763 4 1362 7 1085 0 879 6 619 2 414 3 75 Name: count, dtype: int64
In [42]:
plt.figure(figsize=(20,10))
ax = sns.scatterplot(x="pca1", y="pca2", hue = "cluster", data = pca_df, palette =['red','green','blue','pink','yellow','gray','purple', 'black'])
plt.show()
In [43]:
from tensorflow.keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.initializers import glorot_uniform # This is normalizer
from keras.optimizers import SGD
encoding_dim = 7
input_df = Input(shape=(17,)) # 17 Features
# Glorot normal initializer (Xavier normal initializer) draws samples from a truncated normal distribution
x = Dense(encoding_dim, activation='relu')(input_df)
x = Dense(500, activation='relu', kernel_initializer = 'glorot_uniform')(x)
x = Dense(500, activation='relu', kernel_initializer = 'glorot_uniform')(x)
x = Dense(2000, activation='relu', kernel_initializer = 'glorot_uniform')(x)
encoded = Dense(10, activation='relu', kernel_initializer = 'glorot_uniform')(x)
x = Dense(2000, activation='relu', kernel_initializer = 'glorot_uniform')(encoded)
x = Dense(500, activation='relu', kernel_initializer = 'glorot_uniform')(x)
decoded = Dense(17, kernel_initializer = 'glorot_uniform')(x)
# autoencoder
autoencoder = Model(input_df, decoded)
#encoder - used for our dimention reduction
encoder = Model(input_df, encoded)
autoencoder.compile(optimizer= 'adam', loss='mean_squared_error')
In [44]:
display (creditcard_df_scaled.shape)
(8950, 17)
In [45]:
autoencoder.fit(creditcard_df_scaled, creditcard_df_scaled, batch_size = 128, epochs = 25, verbose = 1)
Epoch 1/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 20ms/step - loss: 0.6548 Epoch 2/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - loss: 0.3745 Epoch 3/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 31ms/step - loss: 0.2987 Epoch 4/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 28ms/step - loss: 0.2637 Epoch 5/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - loss: 0.2259 Epoch 6/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 28ms/step - loss: 0.2042 Epoch 7/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 27ms/step - loss: 0.1839 Epoch 8/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - loss: 0.1660 Epoch 9/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - loss: 0.1554 Epoch 10/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - loss: 0.1668 Epoch 11/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 27ms/step - loss: 0.1363 Epoch 12/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 28ms/step - loss: 0.1361 Epoch 13/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 29ms/step - loss: 0.1347 Epoch 14/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 27ms/step - loss: 0.1237 Epoch 15/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 29ms/step - loss: 0.1189 Epoch 16/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 27ms/step - loss: 0.1157 Epoch 17/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - loss: 0.1028 Epoch 18/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - loss: 0.1033 Epoch 19/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - loss: 0.0963 Epoch 20/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 29ms/step - loss: 0.0998 Epoch 21/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 28ms/step - loss: 0.0975 Epoch 22/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - loss: 0.0882 Epoch 23/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - loss: 0.0910 Epoch 24/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - loss: 0.0828 Epoch 25/25 70/70 ━━━━━━━━━━━━━━━━━━━━ 2s 27ms/step - loss: 0.0810
Out[45]:
<keras.src.callbacks.history.History at 0x2adee6240>
In [46]:
autoencoder.summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ input_layer (InputLayer) │ (None, 17) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense (Dense) │ (None, 7) │ 126 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_1 (Dense) │ (None, 500) │ 4,000 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_2 (Dense) │ (None, 500) │ 250,500 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_3 (Dense) │ (None, 2000) │ 1,002,000 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_4 (Dense) │ (None, 10) │ 20,010 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_5 (Dense) │ (None, 2000) │ 22,000 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_6 (Dense) │ (None, 500) │ 1,000,500 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_7 (Dense) │ (None, 17) │ 8,517 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 6,922,961 (26.41 MB)
Trainable params: 2,307,653 (8.80 MB)
Non-trainable params: 0 (0.00 B)
Optimizer params: 4,615,308 (17.61 MB)
In [52]:
pred = encoder.predict(creditcard_df_scaled)
display (pd.DataFrame(pred))
280/280 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.430730 | 0.0 | 0.0 | 0.529805 | 0.072232 | 0.412029 | 0.0 | 0.706223 | 0.0 |
| 1 | 0.0 | 2.012459 | 0.0 | 0.0 | 0.999295 | 1.332797 | 2.951261 | 0.0 | 0.000000 | 0.0 |
| 2 | 0.0 | 3.829622 | 0.0 | 0.0 | 0.405321 | 2.754941 | 0.750495 | 0.0 | 1.722407 | 0.0 |
| 3 | 0.0 | 2.267767 | 0.0 | 0.0 | 0.168042 | 0.845819 | 0.416191 | 0.0 | 0.376821 | 0.0 |
| 4 | 0.0 | 1.333349 | 0.0 | 0.0 | 0.462254 | 0.269638 | 0.583908 | 0.0 | 0.672038 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | 0.0 | 1.377972 | 0.0 | 0.0 | 4.348932 | 0.000000 | 1.522304 | 0.0 | 3.482508 | 0.0 |
| 8946 | 0.0 | 1.006089 | 0.0 | 0.0 | 3.135665 | 0.000000 | 1.397675 | 0.0 | 3.713872 | 0.0 |
| 8947 | 0.0 | 1.680523 | 0.0 | 0.0 | 3.097224 | 0.000000 | 1.873369 | 0.0 | 3.893394 | 0.0 |
| 8948 | 0.0 | 3.434419 | 0.0 | 0.0 | 3.554084 | 0.000000 | 3.134046 | 0.0 | 3.907337 | 0.0 |
| 8949 | 0.0 | 6.092276 | 0.0 | 0.0 | 5.141526 | 2.463204 | 4.314789 | 0.0 | 7.287122 | 0.0 |
8950 rows × 10 columns
In [53]:
pred.shape
Out[53]:
(8950, 10)
In [54]:
scores_2 = []
range_values = range(1, 20)
for i in range_values:
kmeans = KMeans(n_clusters= i)
kmeans.fit(pred)
scores_2.append(kmeans.inertia_)
plt.plot(scores_2, 'bx-')
plt.title('Finding right number of clusters')
plt.xlabel('Clusters')
plt.ylabel('scores')
plt.show()
In [55]:
plt.plot(scores_1, 'bx-', color = 'r')
plt.plot(scores_2, 'bx-', color = 'g')
Out[55]:
[<matplotlib.lines.Line2D at 0x1402b6e40>]
In [56]:
kmeans = KMeans(4)
kmeans.fit(pred)
labels = kmeans.labels_
y_kmeans = kmeans.fit_predict(creditcard_df_scaled)
In [57]:
display(labels.shape) # Labels associated to each data point
display (labels.max())
display (labels.min())
(8950,)
3
0
In [58]:
y_kmeans = kmeans.fit_predict(pred)
display(y_kmeans)
array([0, 1, 1, ..., 1, 2, 2], dtype=int32)
In [59]:
df_cluster_dr = pd.concat([creditcard_df, pd.DataFrame({'cluster':labels})], axis = 1)
display(df_cluster_dr.head())
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 | 1 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 | 1 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 | 2 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 | 1 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 | 1 |
In [60]:
pca = PCA(n_components=2)
prin_comp = pca.fit_transform(pred)
pca_df = pd.DataFrame(data = prin_comp, columns =['pca1','pca2'])
display (pca_df.head())
| pca1 | pca2 | |
|---|---|---|
| 0 | -1.917703 | 0.101947 |
| 1 | 0.010588 | 1.234658 |
| 2 | 0.991439 | -1.104011 |
| 3 | -1.448375 | -0.136880 |
| 4 | -1.835361 | 0.073406 |
In [61]:
pca_df = pd.concat([pca_df,pd.DataFrame({'cluster':labels})], axis = 1)
pca_df.head()
Out[61]:
| pca1 | pca2 | cluster | |
|---|---|---|---|
| 0 | -1.917703 | 0.101947 | 1 |
| 1 | 0.010588 | 1.234658 | 1 |
| 2 | 0.991439 | -1.104011 | 2 |
| 3 | -1.448375 | -0.136880 | 1 |
| 4 | -1.835361 | 0.073406 | 1 |
In [62]:
pca_df.value_counts(pca_df.cluster)
Out[62]:
cluster 1 5281 2 2289 3 973 0 407 Name: count, dtype: int64
In [63]:
plt.figure(figsize=(10,10))
ax = sns.scatterplot(x="pca1", y="pca2", hue = "cluster", data = pca_df, palette =['red','green','blue','yellow'])
plt.show()
In [ ]: